The following datasets are used in this demonstration:
The datasets, code and report are all available together in my GitHub repository (https://github.com/bz-dev/ox-interview).
The full HTML report, which contains interactive plots, is available for download at https://github.com/bz-dev/ox-interview/blob/main/notebook/data_vis_full_output.html
Population and household estimates (univariate)
1.1 Gender ratio
Basic pie chart
1.2 Population and gender ratio by region
How to use subplots to group charts together, and how to use annotations to add more details to the plot.
1.3 Population by outward postcode using choropleth map
How to plot location related data on a map using geojson.
1.4 Population by postcode parent area using choropleth map
How to merge fine-grained geographic areas into larger ones and plot them on the map.
House prices and earnings (multivariate)
2.1 Correlation between median house price and median household earnings
How to use a combination of scatter plot, trendline, box plot and rug plot to show distributions and correlations between variables.
2.2 Median and lower quartile house prices from 2002 to 2020 (animated)
How to use an animated chart to show trends as one variable changes.
Text analysis and visualisation
3.1 Text tokenization and word cloud
How to perform a simple text tokenization and use word cloud to show word frequencies.
3.2 Basic sentiment analysis with VADER
How to perform a simple sentiment analysis and use grouped line charts.
Please run this code block before running any other blocks in this notebook.
# Install packages using jupyter notebook's built-in %pip function
%pip install pandas plotly geojson shapely openpyxl scipy nltk
# Import required packages
import pandas as pd
import numpy as np
import json
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re
import shapely.geometry
from shapely.ops import unary_union
import geojson
import itertools
import wordcloud
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Route DataFrame.plot()/Series.plot() through Plotly instead of the default backend
pd.set_option("plotting.backend", "plotly")
# Locations of the input resources, relative to the notebook directory
file_population = Path("../data/ons/population_household.csv")
dir_geojson = Path("../data/geojson")
file_house_price = Path("../data/ons/house_price_earning.xlsx")
file_tweet = Path("../data/tweet/covid19_tweets.csv")
# Signal that the shared setup has finished
print("✅ Notebook preparation completed.")
Requirement already satisfied: pandas in d:\repository\ox-interview\venv\lib\site-packages (1.2.4) Requirement already satisfied: plotly in d:\repository\ox-interview\venv\lib\site-packages (4.14.3) Note: you may need to restart the kernel to use updated packages.Requirement already satisfied: geojson in d:\repository\ox-interview\venv\lib\site-packages (2.5.0) Requirement already satisfied: shapely in d:\repository\ox-interview\venv\lib\site-packages (1.7.1) Requirement already satisfied: openpyxl in d:\repository\ox-interview\venv\lib\site-packages (3.0.7) Requirement already satisfied: scipy in d:\repository\ox-interview\venv\lib\site-packages (1.6.3) Requirement already satisfied: nltk in d:\repository\ox-interview\venv\lib\site-packages (3.6.2) Requirement already satisfied: python-dateutil>=2.7.3 in d:\repository\ox-interview\venv\lib\site-packages (from pandas) (2.8.1) Requirement already satisfied: pytz>=2017.3 in d:\repository\ox-interview\venv\lib\site-packages (from pandas) (2021.1) Requirement already satisfied: numpy>=1.16.5 in d:\repository\ox-interview\venv\lib\site-packages (from pandas) (1.20.3) Requirement already satisfied: six>=1.5 in d:\repository\ox-interview\venv\lib\site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0) Requirement already satisfied: retrying>=1.3.3 in d:\repository\ox-interview\venv\lib\site-packages (from plotly) (1.3.3) Requirement already satisfied: et-xmlfile in d:\repository\ox-interview\venv\lib\site-packages (from openpyxl) (1.1.0) Requirement already satisfied: joblib in d:\repository\ox-interview\venv\lib\site-packages (from nltk) (1.0.1) Requirement already satisfied: regex in d:\repository\ox-interview\venv\lib\site-packages (from nltk) (2021.4.4) Requirement already satisfied: click in d:\repository\ox-interview\venv\lib\site-packages (from nltk) (8.0.0) Requirement already satisfied: tqdm in d:\repository\ox-interview\venv\lib\site-packages (from nltk) (4.60.0) Requirement already satisfied: colorama in 
d:\repository\ox-interview\venv\lib\site-packages (from click->nltk) (0.4.4) ✅ Notebook preparation completed.
# Declare section parent variable
var1 = dict()
# Load data
var1["df"] = pd.read_csv(file_population)
# Split postcode into outward and inward, e.g. LE3 9QP => LE3 and 9QP.
# The inward code of a UK postcode is always its last three characters, so
# split from the right instead of at a fixed column; this also handles
# single-space postcodes with a short outward code (e.g. "B1 1AA").
var1["df"]["Postcode"] = var1["df"]["Postcode"].str.strip()
var1["df"]["postcode_out"] = var1["df"]["Postcode"].str[:-3].str.strip()
var1["df"]["postcode_in"] = var1["df"]["Postcode"].str[-3:]
# Keep only the leading non-numeric part of the outward code as the parent
# region (raw string avoids the invalid "\d" escape in a plain literal)
var1["df"]["postcode_region"] = var1["df"]["postcode_out"].str.extract(r"^(\D*)", expand=False)
# Remove column [Postcode] to save memory
var1["df"] = var1["df"].drop(columns=["Postcode"])
print("✅ Section 1 preparation completed.")
✅ Section 1 preparation completed.
# Container for everything produced in section 1.1
sec1_1 = {}
# National head-count per gender, one row per slice of the pie
sec1_1["df"] = pd.DataFrame(
    {
        "Gender": ["Male", "Female"],
        "Count": [
            var1["df"]["Males"].sum(),
            var1["df"]["Females"].sum(),
        ],
    }
)
# Build the pie chart and render it
sec1_1["fig"] = px.pie(
    sec1_1["df"],
    names="Gender",
    values="Count",
    title="National gender ratio (England and Wales)",
)
sec1_1["fig"].show()
# Drop the section container so its data can be garbage-collected
del sec1_1
# Declare section parent variable
sec1_2 = dict()
# Sum up all numeric columns within the same postcode region, descending order
# by column [Total]. numeric_only=True stops newer pandas versions from
# concatenating the string postcode columns during the sum.
sec1_2["df"] = (
    var1["df"]
    .groupby(["postcode_region"])
    .sum(numeric_only=True)
    .sort_values(by=["Total"], ascending=False)
    .reset_index()
)
# The ten most populated regions and their male/female ratio, computed once
# and reused by both subplots and by the annotations
sec1_2["top10"] = sec1_2["df"].head(10)
sec1_2["mf_ratio"] = sec1_2["top10"]["Males"] / sec1_2["top10"]["Females"]
# Generate plots side by side
sec1_2["fig"] = make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_xaxes=True,
                              shared_yaxes=False, vertical_spacing=0.001)
# Add plot 1: bar chart for top 10 population regions
# (add_trace with row/col replaces the deprecated append_trace)
sec1_2["fig"].add_trace(
    go.Bar(
        x=sec1_2["top10"]["Total"],
        y=sec1_2["top10"]["postcode_region"],
        marker=dict(color='rgba(50, 171, 96, 0.6)',
                    line=dict(color='rgba(50, 171, 96, 1.0)', width=1)),
        name='Population',
        orientation='h',
    ),
    row=1, col=1)
# Add plot 2: scatter chart for gender ratio in top 10 population regions
sec1_2["fig"].add_trace(
    go.Scatter(
        x=sec1_2["mf_ratio"],
        y=sec1_2["top10"]["postcode_region"],
        mode='lines+markers', line_color='rgb(128, 0, 128)', name='M/F ratio',
    ),
    row=1, col=2)
# Update layout changes on plot title, axes, color and legends.
# xaxis2 carries the M/F ratio (values close to 1), so tick spacing is left to
# Plotly; a fixed dtick of 25000 would leave this axis without any ticks.
sec1_2["fig"].update_layout(
    title='Male/Female ratio in top 10 population regions',
    yaxis=dict(showgrid=False, showline=False, showticklabels=True, domain=[0, 0.85]),
    yaxis2=dict(showgrid=False, showline=True, showticklabels=False,
                linecolor='rgba(102, 102, 102, 0.8)', linewidth=2, domain=[0, 0.85]),
    xaxis=dict(zeroline=False, showline=False, showticklabels=True,
               showgrid=True, domain=[0, 0.42]),
    xaxis2=dict(zeroline=False, showline=False, showticklabels=True, showgrid=True,
                domain=[0.47, 1], side='top'),
    legend=dict(x=0.029, y=1.038, font_size=10),
    margin=dict(l=100, r=20, t=70, b=70),
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
)
# Add annotations to the plot
annotations = []
# Adding labels to charts
for a_mf, a_pop, x_pcr in zip(np.round(sec1_2["mf_ratio"], decimals=2),
                              sec1_2["top10"]["Total"],
                              sec1_2["top10"]["postcode_region"]):
    # Add label to M/F ratio scatter plot
    annotations.append(dict(xref='x2', yref='y2', y=x_pcr, x=a_mf,
                            text=a_mf, xshift=50, showarrow=False))
    # Add label to population bar chart (population shown in millions)
    annotations.append(dict(xref='x1', yref='y1', y=x_pcr, x=a_pop,
                            text=f"{np.round(a_pop / 1000000, 3)}M",
                            xshift=25, showarrow=False))
sec1_2["fig"].update_layout(annotations=annotations)
sec1_2["fig"].show()
# Remove variables to save memory
del sec1_2
Concatenate the individual postcode GeoJSON files into a single feature collection, stored in `sec1_3["geojson"]`.
# Declare section parent variable
sec1_3 = dict()
# Create parent geojson collection object
sec1_3["geojson"] = dict(type="FeatureCollection", features=list())
# Load all geojson files (sorted so the feature order is reproducible)
for f_geojson in sorted(dir_geojson.glob("*.geojson")):
    # GeoJSON is UTF-8 encoded; do not rely on the platform default encoding
    with open(f_geojson, encoding="utf-8") as f:
        geojson_data = json.load(f)
    for feature in geojson_data["features"]:
        # Add feature id using properties.name, which is the outward postcode
        feature["id"] = feature["properties"]["name"]
        # Add feature to parent geojson collection
        sec1_3["geojson"]["features"].append(feature)
# Sum up all numeric columns within the same postcode outward area.
# numeric_only=True stops newer pandas versions from concatenating the
# remaining string postcode columns during the sum.
sec1_3["df"] = var1["df"].groupby(["postcode_out"]).sum(numeric_only=True).reset_index()
# Get max population number among areas (upper bound of the colour scale)
sec1_3["p_max"] = sec1_3["df"]["Total"].max()
# Plot figure, reusing the aggregated frame instead of grouping a second time
sec1_3["fig"] = px.choropleth_mapbox(sec1_3["df"],
                                     geojson=sec1_3["geojson"],
                                     locations='postcode_out', color='Total',
                                     color_continuous_scale="jet",
                                     range_color=(0, sec1_3["p_max"]),
                                     mapbox_style="carto-positron",
                                     zoom=4.5, center={"lat": 52.5, "lon": -1.6},
                                     opacity=0.5)
sec1_3["fig"].update_layout(margin=dict(r=0, t=0, l=0, b=0))
sec1_3["fig"].show()
# Remove variables to save memory
del sec1_3